import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the training recipes.  The columns used below are 'id', 'cuisine' and
# 'ingredients'.  JSON text is UTF-8, so the encoding is pinned instead of
# being left to the platform default (which is not UTF-8 everywhere).
with open('data/train.json', encoding='utf-8') as train_file:
    dishes = pd.read_json(train_file)
print("There are {0} total recipes.".format(len(dishes['id'].unique())))
# Bare expressions such as the next line only display something when they are
# the last statement of a notebook cell.
dishes.head()
print("There are {0} total cuisines available.".format(len(dishes['cuisine'].unique())))
dishes['cuisine'].value_counts()
# DataFrame.explode needs pandas >= 0.25.  From here on `dishes` holds one row
# per (recipe, ingredient) pair, so recipe ids repeat across rows.
dishes = dishes.explode('ingredients')
print("There are {0} total unique ingredients.".format(len(dishes['ingredients'].unique())))
dishes.head()
def hbar_top_facet_grid(df, label, value, group, color='b', top_n=5):
    '''Draw one horizontal bar chart per group showing its highest-valued rows.

    Args:
        df: DataFrame containing the `label`, `value` and `group` columns.
        label: Name of the column whose entries label the bars.
        value: Name of the column that gives the bar lengths.
        group: Name of the column that defines the facets (one per value).
        color: Colour forwarded to `plt.barh` for every bar.
        top_n: Number of rows kept per group.  Defaults to 5, the value that
            used to be hard-coded.

    Returns:
        None.  The grid is drawn through seaborn/matplotlib.

    Note:
        Calls `sns.set`, which changes the seaborn theme globally.
    '''
    # Sorting descending lets head() pick the largest `value` rows per group.
    top_rows = (
        df.sort_values([group, value], ascending=False)
        .groupby(group)
        .head(top_n)
    )
    # To view all styles: plt.style.available
    sns.set(style="white", font_scale=1.9)
    # Each facet gets its own axes (no shared x/y); see
    # https://cduvallet.github.io/posts/2018/11/facetgrid-ylabel-access
    # The rows are re-sorted ascending before being handed to the grid.
    grid = sns.FacetGrid(
        top_rows.sort_values([group, value]),
        col=group,
        col_wrap=4,
        sharey=False,
        sharex=False,
        height=4,
        aspect=1.5,
    )
    grid.map(plt.barh, label, value, color=color)
    grid.set_titles(col_template="{col_name}", size=24)
    # Blank axis labels: the facet title and bar labels carry the information.
    grid.set_xlabels(' ')
    grid.set_ylabels(' ')
# Absolute usage: number of rows per (cuisine, ingredient) pair, plotted as
# the most frequent ingredients of every cuisine.
cuisine_ingredient_counts = dishes.groupby(['cuisine', 'ingredients']).size()
data = cuisine_ingredient_counts.reset_index(name='count')
hbar_top_facet_grid(data, 'ingredients', 'count', 'cuisine')

# Relative usage: of all occurrences of an ingredient, which percentage falls
# into each cuisine?  Dividing the two groupby results aligns them on the
# shared 'ingredients' index level, see
# https://stackoverflow.com/questions/23377108/pandas-percentage-of-total-with-groupby
per_pair = dishes.groupby(['ingredients', 'cuisine']).size()
per_ingredient = dishes.groupby('ingredients').size()
data = (per_pair / per_ingredient).reset_index(name='relative_usage')  # reset_index yields a DataFrame
data['relative_usage'] = data['relative_usage'] * 100

# Ingredients that occur only once or twice end up at (or near) 100% for a
# single cuisine and would dominate the chart.
# To inspect the distribution: data['relative_usage'].value_counts(bins=10)
# Two ways to deal with that:
#   1) keep only ingredients whose share of all ingredient rows
#      (dishes['ingredients'].value_counts(normalize=True)) reaches a minimum
#      density -- not used here;
#   2) keep only rows whose relative usage is below 90%, i.e. drop the
#      near-exclusive (rarest) ingredients -- used below.
is_below_cutoff = data['relative_usage'] < 90
data = data[is_below_cutoff]

# Redraw with the relative measure.
hbar_top_facet_grid(data, 'ingredients', 'relative_usage', 'cuisine', color='orange')
Much more interesting!
import json
import itertools
import networkx as nx
# Read the raw records again with the json module; the per-recipe ingredient
# lists are needed for building ingredient pairs.  The encoding is pinned to
# UTF-8 (the JSON encoding) rather than left to the platform default.
with open('data/train.json', encoding='utf-8') as train_file:
    dishes_json = json.load(train_file)
def get_combinations(l):
    '''Return every unordered pair of items in `l` as a list of 2-tuples.

    Pairs follow the order produced by `itertools.combinations`: each tuple
    keeps the relative order the two items have in `l`.  Inputs with fewer
    than two items give an empty list.
    '''
    return list(itertools.combinations(l, 2))
def get_cuisine_top_ingr(pct):
    '''Map each cuisine to the ingredients whose share within it exceeds `pct`.

    The share is the row count of a (cuisine, ingredient) pair divided by the
    row count of the cuisine in the module-level `dishes` frame.  The
    comparison is strict (`>`).  Cuisines without any qualifying ingredient do
    not appear in the result.

    NOTE(review): `dishes` has been exploded to one row per ingredient by the
    time this runs, so the denominator counts ingredient rows rather than
    recipes -- confirm this is the intended meaning of `pct`.
    '''
    pair_counts = dishes.groupby(['cuisine', 'ingredients']).size()
    cuisine_totals = dishes.groupby(['cuisine']).size()
    # Division aligns on the shared 'cuisine' index level.
    shares = (pair_counts / cuisine_totals).reset_index(name='pct')
    frequent = shares[shares['pct'] > pct]
    return {
        cuisine: rows['ingredients'].tolist()
        for cuisine, rows in frequent.groupby('cuisine')
    }
def filter_node_df(df, top_ingr):
    '''Keep only the node rows whose ingredient is a top ingredient of its cuisine.

    Args:
        df: DataFrame with at least the columns 'cuisine' and 'ingredients'.
        top_ingr: Dict mapping a cuisine to its list of top ingredients.  A
            cuisine missing from the dict is treated as having none, so all
            of its rows are dropped instead of raising KeyError.

    Returns:
        The filtered DataFrame, grouped by cuisine in sorted order with the
        original index labels.  An empty frame with the columns of `df` is
        returned when nothing survives.

    Prints the number of remaining rows as "Nodes: <n>".
    '''
    kept = []
    for cuisine, group in df.groupby('cuisine'):
        piece = group[group['ingredients'].isin(top_ingr.get(cuisine, []))]
        if not piece.empty:
            kept.append(piece)
    # pd.concat raises on an empty list, so fall back to an empty slice.
    filtered = pd.concat(kept) if kept else df.iloc[0:0]
    print("Nodes: {0}".format(filtered.shape[0]))
    return filtered
def filter_edge_df(df, top_ingr):
    '''Keep only the edges whose two endpoints are top ingredients of the cuisine.

    Args:
        df: DataFrame with at least the columns 'cuisine', 'source' and
            'target'.
        top_ingr: Dict mapping a cuisine to its list of top ingredients.  A
            cuisine missing from the dict is treated as having none, so all
            of its edges are dropped instead of raising KeyError.

    Returns:
        The filtered DataFrame, grouped by cuisine in sorted order with the
        original index labels.  An empty frame with the columns of `df` is
        returned when nothing survives.

    Prints the number of remaining rows as "Edges: <n>".
    '''
    kept = []
    for cuisine, group in df.groupby('cuisine'):
        allowed = top_ingr.get(cuisine, [])
        both_ends_allowed = group['source'].isin(allowed) & group['target'].isin(allowed)
        piece = group[both_ends_allowed]
        if not piece.empty:
            kept.append(piece)
    # pd.concat raises on an empty list, so fall back to an empty slice.
    filtered = pd.concat(kept) if kept else df.iloc[0:0]
    print("Edges: {0}".format(filtered.shape[0]))
    return filtered
def draw_network(n, e):
    '''Draw the ingredient graph of one cuisine with networkx/matplotlib.

    Args:
        n: Node DataFrame with a 'size' column and, normally, an
            'ingredients' column naming the node each size belongs to.
        e: Edge DataFrame with 'source' and 'target' columns.

    The graph is built from the edges only, so its node order (and node
    count) need not match the row order of `n`.  networkx expects array
    `node_size` values in the order of the drawn nodes, so sizes are looked
    up per graph node instead of being passed in table order.
    '''
    G = nx.from_pandas_edgelist(e, 'source', 'target')
    if 'ingredients' in n:
        size_by_node = dict(zip(n['ingredients'], n['size']))
        # 300 is networkx's default node size; used for nodes absent from `n`.
        node_sizes = [size_by_node.get(node, 300) for node in G.nodes()]
    else:
        # No node names available: keep the previous positional behaviour.
        node_sizes = n['size']
    nx.draw(
        G,
        with_labels=True,
        node_color="skyblue",
        pos=nx.random_layout(G),
        node_size=node_sizes,
    )
    plt.show()
# Edge table: one record per ingredient pair of every recipe, then counted
# per (source, target, cuisine).
# NOTE(review): pairs keep the order the ingredients have inside a recipe, so
# the same two ingredients can be counted separately as (a, b) and (b, a).
# The graphs drawn below are built without the 'count' column, so this does
# not affect them -- confirm before using 'count' as an edge weight.
edge_records = []
for dish in dishes_json:
    for source, target in get_combinations(dish['ingredients']):
        edge_records.append({
            'id': dish['id'],
            'cuisine': dish['cuisine'],
            'source': source,
            'target': target,
        })
edge_df = (
    pd.DataFrame(edge_records)
    .groupby(['source', 'target', 'cuisine'])
    .size()
    .reset_index(name='count')
)

# Node table: row count of every (cuisine, ingredient) pair.
node_df = dishes.groupby(['cuisine', 'ingredients']).size().reset_index(name='size')
print("Nodes: {0}".format(node_df.shape[0]))  # too many nodes + edges
print("Edges: {0}".format(edge_df.shape[0]))

# Filter both tables with the same per-cuisine ingredient selection
# (share above 0.7%), computed once and reused.
top_ingredients = get_cuisine_top_ingr(0.007)
node_df = filter_node_df(node_df, top_ingredients)
edge_df = filter_edge_df(edge_df, top_ingredients)

# Draw plt networkx; the figure size applies to every figure created below.
plt.rcParams['figure.figsize'] = (8, 6)
for cuisine in ['vietnamese', 'chinese', 'mexican']:
    n = node_df[node_df['cuisine'] == cuisine]
    e = edge_df[edge_df['cuisine'] == cuisine]
    draw_network(n, e)
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Set up plotly for rendering figures inside the notebook.
# NOTE(review): connected=False is expected to load plotly.js from the
# installed package instead of a CDN -- confirm against the plotly version in use.
init_notebook_mode(connected=False)
def _network_edge_trace(G, pos_dict):
    '''Build the single line trace that draws every edge of `G`.'''
    edge_x = []
    edge_y = []
    for source, target in G.edges():
        x0, y0 = pos_dict[source]
        x1, y1 = pos_dict[target]
        # The trailing None breaks the line so separate edges are not joined.
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
    return go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')


def _network_node_trace(node_dict, pos_dict):
    '''Build the marker trace for the nodes of `node_dict` that have a position.'''
    node_x = []
    node_y = []
    sizes = []
    hover_txt = []
    for node, info in node_dict.items():
        # Nodes without any edge are not part of the graph and therefore have
        # no layout position; skip them instead of failing with KeyError.
        if node not in pos_dict:
            continue
        x, y = pos_dict[node]
        node_x.append(x)
        node_y.append(y)
        sizes.append(info['size'])
        hover_txt.append('{0} <br> {1}'.format(str(node).capitalize(), info['size']))
    # max() of an empty list raises; 1 keeps sizeref positive when no node is drawn.
    largest = max(sizes, default=1)
    return go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=hover_txt,
        hoverinfo='text',
        marker=dict(
            colorscale='Picnic',
            reversescale=True,
            # One random colour value per node, mapped through the colorscale.
            color=list(np.random.choice(range(256), size=len(node_x))),
            size=sizes,
            # scale by 2. * max(array of size values) / (desired maximum marker size ** 2)
            sizeref=2. * largest / (15. ** 2),
            line_width=2)
    )


def draw_plotly_network(n, G, title='Cuisine Ingredient Network', file='network'):
    '''Show an interactive plotly figure of an ingredient network.

    Args:
        n: Node DataFrame with an 'ingredients' column (unique per row) and a
            'size' column used for marker size and label text.
        G: networkx graph whose nodes are ingredient names.
        title: Figure title.
        file: Base name intended for a static image export.  Currently unused
            because the export call is commented out.

    Node positions come from `nx.spring_layout(G)`.  Rows of `n` whose
    ingredient is not a node of `G` are skipped.
    '''
    # Position dict for the graph, e.g. {'beansprouts': [x, y], ...}
    pos_dict = nx.spring_layout(G)  # Spring algorithm
    # Per-ingredient record, e.g. {'beansprouts': {'cuisine': ..., 'size': ...}}
    node_dict = n.set_index('ingredients').to_dict('index')

    fig = go.Figure(
        data=[
            _network_edge_trace(G, pos_dict),
            _network_node_trace(node_dict, pos_dict),
        ],
        layout=go.Layout(
            # `titlefont` is deprecated in plotly; title.font is the
            # supported spelling for the title font size.
            title=dict(text=title, font=dict(size=16)),
            showlegend=False,
            plot_bgcolor='rgba(0,0,0,0)',  # set bg colour
            hovermode='closest',
            annotations=[dict(
                # The anchor needs `href` (not `ref`) to be rendered as a link.
                text="Only include ingredients that appear in 0.7% of cuisine recipes. Data Source: <a href='https://www.kaggle.com/c/whats-cooking'>Yummly.</a>",
                showarrow=False,
                xref="paper", yref="paper",
                x=0.005, y=-0.002)],
            margin=dict(b=20, l=5, r=5, t=40),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
    )
    # Static export (disabled): fig.write_image("fig/{0}.png".format(file))
    fig.show()
# Total number of recipes per cuisine.  `dishes` holds one row per
# (recipe, ingredient) pair after the explode step, so counting rows would
# report ingredient occurrences; distinct recipe ids are counted instead.
recipe_total_dict = dishes.groupby('cuisine')['id'].nunique().to_dict()
for c in node_df['cuisine'].unique():
    e = edge_df[edge_df['cuisine'] == c]
    n = node_df[node_df['cuisine'] == c]
    # Initialize G from this cuisine's edges only.
    G = nx.from_pandas_edgelist(e, 'source', 'target')
    draw_plotly_network(
        n=n,
        G=G,
        title='{0} Cuisine Ingredient Network ({1} recipes)'.format(c.capitalize(), recipe_total_dict[c]),
        file=c,
    )